Amiga CD32 Gamer 14

home *** CD-ROM | disk | FTP | other *** search

/ Amiga CD32 Gamer 14 / CD32 Gamer - 1995 - Issue 14.iso / fli / flick15.lha / src / c2p_020.s < prev next >

Wrap

Text File | 1994-11-13 | 7KB | 309 lines

; Chunky2Planar algorithm, originally by James McCoull
; Modified by Peter McGavin for variable size and depth
; and "dirty list" (hope I didn't slow it down too much)
;
;	Cpu only solution
;	Optimised for 020+fastram
;	Aim for less than 90ms for 320x200x256 on 14MHz 020
;
; The symbol `depth` (8, 6 or 4) must be defined before this file is
; assembled; it selects the exported entry name (_c2p_8_020, _c2p_6_020 or
; _c2p_4_020) and the number of bitplanes written.  _LVOCacheClearU must
; also be defined by the assembling environment.

;void __asm c2p_8 (register __a0 UBYTE *chunky_data,
;                  register __a1 PLANEPTR raster,
;                  register __a2 UBYTE *dirty_list,
;                  register __d1 ULONG plsiz,
;                  register __a5 UBYTE *tmp_buffer);

; a0 -> width*height chunky pixels in fastmem
; a1 -> contiguous bitplanes in chipmem
; a2 -> dirty list (1-byte flag for whether each 32 pixel "unit" needs updating)
; d1 = width*height/8   (width*height must be a multiple of 32)
; a5 -> width*height tmp buffer in fastmem
;
; d0 is used as scratch by the relocation stub and is not preserved.
; d2-d7/a2-a6 are saved and restored by the routine itself.
; If d1 is zero the routine returns without touching any buffer.

		ifeq	depth-8
		xdef	_c2p_8_020
_c2p_8_020:
		else
		ifeq	depth-6
		xdef	_c2p_6_020
_c2p_6_020:
		else
		ifeq	depth-4
		xdef	_c2p_4_020
_c2p_4_020:
		else
		fail	"unsupported depth!"
		endc
		endc
		endc

; wordmerge: exchange the low word of i1 with the high word of i2.
; tmp may be an address register (it is only moved to and from).
wordmerge	macro	; i1 i2 tmp
			; \1 \2 \3
					;\1 = AB, \2 = CD
		move.l	\2,\3		;\3 = CD
		move.w	\1,\2		;\2 = CB
		swap	\2		;\2 = BC
		move.w	\2,\1		;\1 = AC
		move.w	\3,\2		;\2 = BD
		endm

; merge: exchange the bit fields selected by msk between io and in, using
; shift count sft.  Results are left in io and out; in and tmp are trashed.
merge		macro	; io in out tmp msk sft
			; \1 \2 \3  \4  \5  \6
					; \1 = abqr
					; \2 = ijyz
		move.l	\5,\3		; \3 = 0x0x
		move.l	\3,\4		; \4 = 0x0x
		and.l	\1,\3		; \3 = 0b0r
		and.l	\2,\4		; \4 = 0j0z
		eor.l	\3,\1		; \1 = a0q0
		eor.l	\4,\2		; \2 = i0y0
		lsr.l	#\6,\2		; \2 = 0i0y
		ifeq	\6-1
		add.l	\3,\3		; shift left by 1 done as an add
		else
		lsl.l	#\6,\3		; \3 = b0r0
		endc
		or.l	\2,\1		; \1 = aiqy
		or.l	\4,\3		; \3 = bjrz
		endm

; merge4: nibble merge (shift fixed at 4) whose results go straight to the
; tmp buffer through (a5)+.  Only five parameters are used; the sixth
; argument supplied at the call sites is ignored.
; For depth > 4 both results are written (io first, then out);
; for depth 4 only the low-nibble result is produced and written.
merge4		macro	; io in out tmp msk
			; \1 \2 \3  \4  \5
					; \1 = abqr
					; \2 = ijyz
		ifgt	depth-4
		move.l	\5,\3		; \3 = 0x0x
		move.l	\3,\4		; \4 = 0x0x
		and.l	\1,\3		; \3 = 0b0r
		and.l	\2,\4		; \4 = 0j0z
		eor.l	\3,\1		; \1 = a0q0
		eor.l	\4,\2		; \2 = i0y0
		lsr.l	#4,\2		; \2 = 0i0y
		or.l	\2,\1		; \1 = aiqy
		move.l	\1,(a5)+	; write to tmp buffer
		lsl.l	#4,\3		; \3 = b0r0
		or.l	\4,\3		; \3 = bjrz
		move.l	\3,(a5)+	; write to tmp buffer
		else
		move.l	\5,\3		; this version returns only 1 result
		and.l	\3,\2		; \2 = 0j0z
		and.l	\1,\3		; \3 = 0b0r
		lsl.l	#4,\3		; \3 = b0r0
		or.l	\2,\3		; \3 = bjrz
		move.l	\3,(a5)+	; write to tmp buffer
		endc
		endm

; merge1: final single-bit merge.  Writes io to the plane at (a2) and steps
; a2 back by one plane (a5 = plsiz).  When flg is non-zero the second result
; (out) is written and a2 stepped back as well; when flg is zero the second
; result is left in out for the caller to store.
merge1		macro	; io in out tmp msk flg
			; \1 \2 \3  \4  \5  \6
					; \1 = abqr
					; \2 = ijyz
		move.l	\5,\3		; \3 = 0x0x
		move.l	\3,\4		; \4 = 0x0x
		and.l	\1,\3		; \3 = 0b0r
		and.l	\2,\4		; \4 = 0j0z
		eor.l	\3,\1		; \1 = a0q0
		eor.l	\4,\2		; \2 = i0y0
		lsr.l	#1,\2		; \2 = 0i0y
		or.l	\2,\1		; \1 = aiqy
		move.l	\1,(a2)		; write to output plane
		suba.l	a5,a2		; -plsiz
		add.l	\3,\3		; \3 = b0r0
		or.l	\4,\3		; \3 = bjrz
		ifne	\6
		move.l	\3,(a2)		; write to output plane
		suba.l	a5,a2		; -plsiz
		endc
		endm

; Entry stub.  The first call falls into `next`, which relocates the routine
; and patches the address operand of this jmp (at start+2) so that every
; later call jumps directly to the relocated routine.  The absolute long
; form is forced so that the operand really is the 32-bit address at
; start+2, even if the assembler would otherwise shorten the instruction.
start:		jmp	(next).l	; self-modified code here

next:		movem.l	d1/a0-a1/a6,-(sp)

; Relocate c2p so that firstsweep2 is at a quad-longword-aligned address.
; Firstsweep2 loop doesn't fit in '020/'030 cache unless it is exactly aligned.
; Speed penalty of misalignment is about 30%.
		lea	(firstsweep2,pc),a0
		move.l	a0,d0
		and.w	#%00001111,d0	; relocate by -d0.w bytes
		lea	(c2p,pc),a0	; a0 = src
		movea.l	a0,a1
		sub.w	d0,a1		; a1 = dst
		move.l	a1,start+2	; patch jmp
		move.w	#(end-c2p)/2-1,d0
loop:		move.w	(a0)+,(a1)+	; relocate code (dst is at or below src,
		dbra	d0,loop		;  so an ascending copy is safe)
		move.l	(4).w,a6	; flush cache
		jsr	(_LVOCacheClearU,a6)
		movem.l	(sp)+,d1/a0-a1/a6
		bra.b	start		; restart

		ds.w	8		; space for relocation of c2p routine

; the real c2p routine starts here
;
; Stack frame (24 bytes):
;   (0,sp)   scratch slot for one data register
;   (4,sp)   output (plane 0) address
;   (8,sp)   dirty list pointer
;   (12,sp)  plsiz
;   (16,sp)  7*plsiz (or 5*plsiz) (or 3*plsiz)
;   (20,sp)  tmp buffer address

c2p:		movem.l	d2-d7/a2-a6,-(sp)
		sub.w	#24,sp		; space for temporary variables

; An empty image (plsiz = 0) would make a1 equal a0 on entry; the sweep
; loops advance a0 before testing for the end and would never terminate.
		tst.l	d1
		beq.w	exit

; a0 = chunky buffer
; a1 = output area
; a2 = dirty list
; d1 = plsiz
; a5 = tmp buffer

		move.l	a1,(4,sp)	; save output address
		move.l	a2,(8,sp)	; save dirty list ptr
		move.l	d1,(12,sp)	; save plsiz
		lsl.l	#3,d1		; d1 = 8*plsiz = width*height
		movea.l	a0,a1
		adda.l	d1,a1		; a1 -> end of chunky buffer
		sub.l	(12,sp),d1	; d1 = 7*plsiz
		ifle	depth-6
		sub.l	(12,sp),d1
		sub.l	(12,sp),d1	; d1 = 5*plsiz
		endc
		ifle	depth-4
		sub.l	(12,sp),d1
		sub.l	(12,sp),d1	; d1 = 3*plsiz
		endc
		move.l	d1,(16,sp)	; save 7*plsiz (or 5*plsiz) (or 3*plsiz)
		move.l	a5,(20,sp)	; save tmp buffer address

;; Sweep thru the whole chunky data once,
;; Performing 3 merge operations on it.

		move.l	#$00ff00ff,a3	; load byte merge mask
		move.l	#$0f0f0f0f,a4	; load nibble merge mask

; pass 1

firstsweep:	tst.b	(a2)+		; does next 32 pixel unit need updating?
		bne.b	firstsweep3
		adda.w	#32,a0		; skip 32 pixels on input/output
		cmpa.l	a0,a1
		bne.b	firstsweep
		bra.w	exit		; exit if no changes

; this becomes the first sweep's main loop after the first change is found
; (clean units are skipped on input only; the tmp buffer holds the dirty
; units back to back)

firstsweep2:	tst.b	(a2)+		; does next 32 pixel unit need updating?
		bne.b	firstsweep3
		adda.w	#32,a0		; skip 32 pixels on input
		cmpa.l	a0,a1
		bne.b	firstsweep2
		bra.w	secondsweep	; on to second sweep if changes

firstsweep3:	movem.l	(a0)+,d0-d7	; get 32 pixels in registers
		; d0-7 = abcd efgh ijkl mnop qrst uvwx yzAB CDEF

		wordmerge d0,d4,a6	;d0/4 = abqr cdst
		wordmerge d1,d5,a6	;d1/5 = efuv ghwx
		wordmerge d2,d6,a6	;d2/6 = ijyz klAB
		wordmerge d3,d7,a6	;d3/7 = mnCD opEF

; temporarily save off some registers
		movea.l	d7,a6
		move.l	d6,(sp)

; pass 2
		merge	d0,d2,d6,d7,a3,8	;d0/d6 = aiqy bjrz
		merge	d1,d3,d7,d2,a3,8	;d1/d7 = emuC fnvD

; pass 3
		merge4	d0,d1,d2,d3,a4,4	;d0/d2 = ae74... ae30...
		merge4	d6,d7,d3,d1,a4,4	;d6/d3 = bf74... bf30...

; bring them back
		move.l	a6,d7
		move.l	(sp),d6

; pass 2
		merge	d4,d6,d0,d1,a3,8	;d4/d0 = cksA dltB
		merge	d5,d7,d1,d6,a3,8	;d5/d1 = gowE hpxF

; pass 3
		merge4	d4,d5,d6,d7,a4,4	;d4/d6 = cg74.. cg30..
		merge4	d0,d1,d7,d5,a4,4	;d0/d7 = dh74.. dh30..

		cmpa.l	a0,a1
		bne.w	firstsweep2

; end of firstsweep, 250 bytes
; only just fits in instr cache

; (a0) ae74.. ae30.. bf74.. bf30.. cg74.. cg30.. dh74.. dh30..

secondsweep:	movea.l	a5,a1		; a1 -> end of tmp buffer
		movea.l	(4,sp),a2	; a2 -> plane0
		movea.l	(8,sp),a6	; a6 -> dirty list
		movea.l	(12,sp),a5	; a5 = plsiz
		adda.l	(16,sp),a2	; a2 -> plane7 (or plane5) (or plane3)
		movea.l	(20,sp),a0	; a0 -> tmp buffer
		movea.l	#$33333333,a3	; 2-bit merge mask
		movea.l	#$55555555,a4	; 1-bit merge mask
		bra.b	secondsweep2

secondsweep3:	addq.l	#4,a2		; skip 32 pixels on output

secondsweep2:	tst.b	(a6)+		; does next 32 pixel unit need updating?
		beq.b	secondsweep3

		ifgt	depth-4
		movem.l	(a0)+,d0-d6	; read tmp buffer, not d7 yet

; save d5 temporarily
		move.l	d5,(sp)

;; pass 4
		merge	d0,d4,d5,d7,a3,2	; d0/d5 = aceg76.. aceg54..
		merge	d2,d6,d7,d4,a3,2	; d2/d7 = bdhf76.. bdhf54..

;; pass 5
		ifgt	depth-6
		merge1	d0,d2,d4,d6,a4,1	; d0/d4 = abcd7... abcd6...
		endc
		merge1	d5,d7,d6,d2,a4,1	; d5/d6 = abcd5... abcd4...

; restore d5 and finally get d7
		move.l	(sp),d5
		move.l	(a0)+,d7
		else
		movem.l	(a0)+,d1/d3/d5/d7	; read tmp buf, depth 4 version
		endc

;; pass 4
		merge	d1,d5,d4,d6,a3,2	; d1/d4 = aceg32.. aceg10..
		merge	d3,d7,d6,d5,a3,2	; d3/d6 = bdhf32.. bdhf10..

;; pass 5
		merge1	d1,d3,d5,d7,a4,1	; d1/d5 = abcd3... abcd2...
		merge1	d4,d6,d7,d3,a4,0	; d4/d7 = abcd1... abcd0...
		move.l	d7,(a2)+		; plane 0
		adda.l	(16,sp),a2	; +7*plsiz (or 5*plsiz) (or 3*plsiz)

		cmp.l	a0,a1
		bne.w	secondsweep2

; end of secondsweep, 216 bytes

exit:		add.w	#24,sp
		movem.l	(sp)+,d2-d7/a2-a6
		rts

end:
		end